{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f9ee0bdb-b288-478d-9deb-bd157cfaf8f3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>tickers</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1996-01-02</td>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1996-01-03</td>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1996-01-04</td>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1996-01-10</td>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1996-01-11</td>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date                                            tickers\n",
       "0 1996-01-02  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...\n",
       "1 1996-01-03  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...\n",
       "2 1996-01-04  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...\n",
       "3 1996-01-10  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...\n",
       "4 1996-01-11  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "\n",
    "df = pd.read_csv(\"S&P 500 Historical Components & Changes(07-12-2025).csv\")\n",
    "df['date'] = pd.to_datetime(df['date'])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b1423720-3384-4d61-ae24-bca401d157d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get Unique Tickers\n",
    "unique_tickers = set()\n",
    "transformed_data = []\n",
    "for i, row in df.iterrows():\n",
    "    tickers = row['tickers'].split(',')\n",
    "    unique_tickers = unique_tickers.union(set(tickers))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e6f3434e-7c64-48e8-8169-c6950f9dacaf",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1181/1181 [05:04<00:00,  3.88it/s]\n"
     ]
    }
   ],
   "source": [
    "# Convert Point in Time to Ticker Start/End Records\n",
    "ticker_start_end_records = []\n",
    "for ticker in tqdm(unique_tickers):\n",
    "    ticker_record = None\n",
    "    for i, row in df.iterrows():\n",
    "        period_tickers = set(row['tickers'].split(','))\n",
    "        if ticker in period_tickers:\n",
    "            if ticker_record is None:\n",
    "                ticker_record = {\"ticker\": ticker, \"start_date\": row[\"date\"]}\n",
    "                continue\n",
    "        elif ticker_record is not None:\n",
    "            ticker_record.update({\"end_date\": row[\"date\"]})\n",
    "            ticker_start_end_records.append(ticker_record)\n",
    "            ticker_record = None\n",
    "\n",
    "    if ticker_record is not None:\n",
    "        ticker_start_end_records.append(ticker_record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "27c4ed4d-f8a1-4b05-a785-bf1e4090317e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ticker</th>\n",
       "      <th>start_date</th>\n",
       "      <th>end_date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HCP</td>\n",
       "      <td>2008-03-31</td>\n",
       "      <td>2019-11-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NCLH</td>\n",
       "      <td>2017-10-13</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>JEC</td>\n",
       "      <td>2007-10-26</td>\n",
       "      <td>2019-12-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HPC</td>\n",
       "      <td>1996-01-02</td>\n",
       "      <td>2008-11-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CBB</td>\n",
       "      <td>1996-01-02</td>\n",
       "      <td>1998-01-28</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  ticker start_date   end_date\n",
       "0    HCP 2008-03-31 2019-11-05\n",
       "1   NCLH 2017-10-13        NaT\n",
       "2    JEC 2007-10-26 2019-12-10\n",
       "3    HPC 1996-01-02 2008-11-14\n",
       "4    CBB 1996-01-02 1998-01-28"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert back to dataframe\n",
    "ticker_record_df = pd.DataFrame(ticker_start_end_records)\n",
    "ticker_record_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f4aed634-93c0-4c36-8907-3cdcd53af73c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ticker\n",
       "COV    3\n",
       "OKE    2\n",
       "H      2\n",
       "HCA    2\n",
       "MIR    2\n",
       "ESV    2\n",
       "EQT    2\n",
       "HLT    2\n",
       "IR     2\n",
       "HP     2\n",
       "CNC    2\n",
       "HRS    2\n",
       "AN     2\n",
       "AMP    2\n",
       "AMD    2\n",
       "Name: start_date, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look at top tickers entering and leaving S&P 500\n",
    "(\n",
    "    ticker_record_df.groupby(\"ticker\")[\"start_date\"]\n",
    "    .count().sort_values(ascending=False)\n",
    "    .head(15)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "30739501-b01d-4840-a03d-975ab2791d31",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Record to CSV\n",
    "(\n",
    "    ticker_record_df.sort_values([\"ticker\", \"start_date\"])\n",
    "    .to_csv(\"sp500_ticker_start_end.csv\", index=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "31288233-b806-4c0d-9105-0ff87da3c772",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1181"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ticker_record_df.ticker.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d1127e76-cc08-4146-98ad-e045ec431399",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1230"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ticker_record_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1b90c49b-0cb0-4772-89d3-1a3cbf7ef3a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Record list of tickers to JSON (Optional)\n",
    "# with open(\"sp_500_full.json\", \"w\") as f:\n",
    "#     json.dump(ticker_record_df.ticker.str.replace(\".\", \" \").to_list(), f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
